In today’s lab class, we will apply the concepts of regression, correlation, and PCA using data sets that are built into R.

1) Requesting the data

# Set the CRAN mirror used by install.packages() later in this lab.
# The mirror is addressed over HTTPS so packages are not downloaded over an
# unencrypted connection. local() keeps the temporary variable out of the
# global workspace.
local({
  repos <- getOption("repos")
  repos["CRAN"] <- "https://cran.r-project.org"
  options(repos = repos)
})

# Load the four built-in example data sets used throughout this lab
data(iris, cars, mtcars, USJudgeRatings)

2) Regression

# Pull the predictor and the response out of the cars data as plain vectors
speed <- cars[["speed"]]
distance <- cars[["dist"]]

# Simple linear regression of stopping distance on speed
model.cars <- lm(formula = distance ~ speed)

# Coefficient table, residual summary and R-squared of the fit
summary(model.cars)
## 
## Call:
## lm(formula = distance ~ speed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
# Scatter plot of stopping distance against speed; the axes span exactly the
# observed range of each variable.
plot(x = speed,
     y = distance,
     xlim = range(speed),
     ylim = range(distance),
     pch = 19,
     main = "Speed of cars vs. Distance taken to stop")

# Annotate the plot with the model's R-squared. The label is placed half a
# standard deviation right of the smallest speed and 10% of the largest
# distance below the top. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
text(x = min(speed) + sd(speed, na.rm = TRUE) / 2,
     y = max(distance) - max(distance) / 10,
     labels = paste0("R-squared = ",
                     round(x = summary(model.cars)$r.squared, digits = 2)))

# Overlay the fitted regression line
abline(model.cars, col = "red")

3) Correlation

# Install corrplot if it is not available yet, then attach it.
# requireNamespace() only checks availability (it does not attach the package),
# so attaching happens in exactly one place: the library() call, which fails
# loudly if the installation did not succeed.

if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
## corrplot 0.92 loaded

# Scatterplot matrix: every USJudgeRatings variable against every other one
pairs(x = USJudgeRatings)

# Pearson correlation between every pair of USJudgeRatings variables
cor.USJudgeRatings <- cor(USJudgeRatings, method = "pearson")

# Visualise the correlation matrix, drawing each coefficient as a circle
corrplot(cor.USJudgeRatings, method = "circle")

4) Use the Iris dataset and perform a principal components analysis of it using the following commands:

# Install the plotting packages if they are not available yet, then attach
# them. requireNamespace() only checks availability; the library() calls do
# the attaching and fail loudly if an installation did not succeed.

if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

if (!requireNamespace("ggfortify", quietly = TRUE)) {
  install.packages("ggfortify")
}

library(ggplot2)
library(ggfortify)

# Measurements only: every iris column except the Species label
iris.data <- iris[, names(iris) != "Species"]

# Species label of each flower, as a plain vector rather than a data frame
iris.species <- iris[["Species"]]

# Principal component analysis of the iris measurements
iris.pca.prcmp <- prcomp(iris.data)

# Plot the PCA result with points coloured by Species (taken from the full
# iris data frame) and the variable loadings drawn and labelled in blue
autoplot(
  iris.pca.prcmp,
  data = iris,
  colour = "Species",
  loadings = TRUE,
  loadings.colour = "blue",
  loadings.label = TRUE,
  loadings.label.size = 3
)

5) Now, perform automated classification of this dataset using the following:

# Install e1071 if it is not available yet, then attach it.
# The explicit library() call matters: without it, a session that had to
# install the package would never attach it, and svm() below would not be
# found.

if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)

# Fit a support vector machine that classifies species from the measurements.
# The type is spelled out in full instead of relying on "C" being partially
# matched to "C-classification".

model.svm <- svm(x = iris.data, y = iris.species, type = "C-classification")

# Computing and plotting PCoA: cmdscale() on the pairwise distances between
# flowers. Colour encodes the species; "+" marks the rows listed in
# model.svm$index (the support vectors, per the e1071 documentation) and "o"
# marks all other rows. The row sequence is derived from the data instead of
# the hard-coded 1:150 so the plot keeps working if the data size changes.

plot(cmdscale(dist(iris.data)),
     col = as.integer(iris.species),
     pch = c("o", "+")[(seq_len(nrow(iris.data)) %in% model.svm$index) + 1])

# Confusion table of predicted vs. actual species. Note that the predictions
# are made on the same rows the model was fitted to.
pred <- predict(model.svm, iris.data)
table(pred, iris.species)
##             iris.species
## pred         setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48

Include the table in your answer.

6) Now do the same types of analysis on the “mtcars” dataset (data from the Motor Trend reviews of cars in the 1974 model year; this is not the same data as previously discussed in class).

# Reload mtcars so this section starts from the unmodified data set
data("mtcars")
# fix(mtcars)   # left disabled, as in the original handout

# PCA on the raw mtcars columns
prcars <- prcomp(mtcars)

# Plot the PCA result with labelled variable loadings
autoplot(
  prcars,
  data = mtcars,
  loadings = TRUE,
  loadings.label = TRUE
)

Include the plot in your answer.

  1. Normalize the dataset
# Try to attach heatmaply; install it first if it is missing.
# require() is kept here because attaching at this point is what produces the
# start-up messages recorded below. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
if (!require(package = "heatmaply", quietly = TRUE)) {
  install.packages("heatmaply")
}
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
## 
## ======================
## Welcome to heatmaply version 1.4.2
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================
library(heatmaply)

# Heatmap of the mtcars values exactly as stored
heatmaply(x = mtcars,
          xlab = "Features",
          ylab = "Cars",
          main = "Raw data")

# Same heatmap after passing the data through normalize()
heatmaply(x = normalize(mtcars),
          xlab = "Features",
          ylab = "Cars",
          main = "Data Normalization")
# Standardise every mtcars column with scale() and convert the resulting
# matrix back to a data frame
scale_data <- as.data.frame(scale(x = mtcars))

# PCA on the standardised data, replacing the earlier raw-data result
prcars <- prcomp(scale_data)

# Plot the PCA result with labelled variable loadings
autoplot(
  prcars,
  data = scale_data,
  loadings = TRUE,
  loadings.label = TRUE
)

7) Run the R command

# Called with no arguments, data() lists the available data sets; pick one
# from that list for the exercise below.
data()

Choose an appropriate additional dataset. Perform PCA and / or SVM and interpret the result.